package com.charm.utils;

import java.util.List;
import java.util.ArrayList;
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;

import org.ansj.domain.Term;
import org.ansj.domain.Result;
import org.ansj.library.UserDefineLibrary;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.recognition.impl.FilterRecognition;

/**
 * Word-segmentation helper built on the ANSJ natural-language library.
 * @author gonglibin
 * 2017.08.22
 */

public class CmAnsj {
	/**
	 * One segmented token: its offset, its text and its part-of-speech tag,
	 * as reported by the ANSJ {@link Term} it was built from.
	 */
	public static class CmWord {
		public int		off;			// offset (from Term.getOffe())
		public String	val;			// token text (from Term.getName())
		public String	nat;			// part-of-speech tag (from Term.getNatureStr())
		
		public CmWord() {}
		public CmWord(int o, String v, String n) {off = o; val = v; nat = n;}
	}
	
	// Selectors telling CmAnsjFilter what to do with each line it reads.
	private static final int CM_TWDS = 0X01;	// line is a stop word
	private static final int CM_TNTS = 0X02;	// line is a stop nature (part of speech)
	private static final int CM_TDIC = 0X04;	// line is a user-dictionary word
	
	// Result list; cleared and refilled by every CmAnsjParse call.
	private final List<CmWord> lst = new ArrayList<CmWord>();
	// Filter holding the stop natures.
	private final FilterRecognition nts = new FilterRecognition();
	// Filter holding the stop words.
	private final FilterRecognition wds = new FilterRecognition();
	
	// Conventional configuration paths. No constructor in this class reads them
	// implicitly; callers pass them in explicitly if they want them loaded.
	public static final String CM_CDIC = "./config/CmUserDict.dat";
	public static final String CM_CWDS = "./config/CmStopWords.dat";
	public static final String CM_CNTS = "./config/CmStopNatures.dat";

	/**
	 * Creates a segmenter with empty stop-word and stop-nature filters.
	 */
	public CmAnsj() {}

	/**
	 * Creates a segmenter and loads the stop-word and stop-nature filters.
	 * Loading is best-effort: a file that cannot be read is reported on
	 * standard error and otherwise ignored.
	 * @param w path of the stop-word file, one entry per line
	 * @param n path of the stop-nature file, one entry per line
	 */
	public CmAnsj(String w, String n) {
		CmAnsjFilter(w, wds, CM_TWDS);
		CmAnsjFilter(n, nts, CM_TNTS);
	}

	/**
	 * Creates a segmenter, loads both filters and adds user-dictionary words.
	 * Loading is best-effort, as for {@link #CmAnsj(String, String)}.
	 * @param w path of the stop-word file, one entry per line
	 * @param n path of the stop-nature file, one entry per line
	 * @param d path of the user-dictionary file, one word per line
	 */
	public CmAnsj(String w, String n, String d) {
		CmAnsjFilter(w, wds, CM_TWDS);
		CmAnsjFilter(n, nts, CM_TNTS);
		CmAnsjFilter(d, null, CM_TDIC);
	}

	/**
	 * Creates a segmenter, loads both filters, adds user-dictionary words and
	 * additionally registers every line of every file in a keyword directory
	 * as a user-dictionary word.
	 * Loading is best-effort, as for {@link #CmAnsj(String, String)}.
	 * @param w path of the stop-word file, one entry per line
	 * @param n path of the stop-nature file, one entry per line
	 * @param d path of the user-dictionary file, one word per line
	 * @param k path of a directory whose files contain one keyword per line
	 */
	public CmAnsj(String w, String n, String d, String k) {
		CmAnsjFilter(w, wds, CM_TWDS);
		CmAnsjFilter(n, nts, CM_TNTS);
		CmAnsjFilter(d, null, CM_TDIC);
		CmAnsjKeysDict(k);
	}
	
	/**
	 * Prints the current result list to standard output, one token per line,
	 * formatted as {@code text: [offset, nature]}.
	 */
	public void CmAnsjPrint() {
		for (CmWord w : lst) {
			System.out.println(w.val + ": [" + w.off + ", " + w.nat + "]");
		}
	}
	
	/**
	 * Joins the tokens of the current result list into one string.
	 * @return every token followed by a single space (so a non-empty result
	 *         ends with a trailing space); the empty string when there are
	 *         no tokens
	 */
	public String CmAnsjToString() {
		// StringBuilder avoids re-copying the accumulated text on every token.
		StringBuilder rst = new StringBuilder();
		
		for (CmWord w : lst) {
			rst.append(w.val).append(' ');
		}
		
		return rst.toString();
	}
	
	/**
	 * Returns the token texts of the current result list as an array.
	 * @return a new array holding the tokens in list order
	 */
	public String[] CmAnsjToArray() {
		int idx = 0;
		String[] arr = new String[lst.size()];
		
		for (CmWord w : lst) {
			arr[idx ++] = w.val;
		}
		
		return arr;
	}
	
	/**
	 * Segments the given text with {@code ToAnalysis} and applies the
	 * stop-nature filter followed by the stop-word filter.
	 * @param s sentence or body text to segment
	 * @return this instance's internal result list; it is cleared and refilled
	 *         by the next call, so copy it if the result must outlive that call
	 */
	public List<CmWord> CmAnsjParse(String s) {
		return CmAnsjBoxUp(ToAnalysis.parse(s).recognition(nts).recognition(wds));
	}
	
	/**
	 * Replaces the contents of the internal result list with one
	 * {@link CmWord} per term of the given result.
	 * @param r ANSJ result to convert
	 * @return the internal result list
	 */
	private List<CmWord> CmAnsjBoxUp(Result r) {
		lst.clear();
		
		for (Term v : r) {
			lst.add(new CmWord(v.getOffe(), v.getName(), v.getNatureStr()));
		}
		
		return lst;
	}
	
	/**
	 * Reads a configuration file line by line and registers every line as a
	 * stop word, a stop nature or a user-dictionary word, depending on
	 * {@code t}. Failures are reported on standard error and do not propagate.
	 * @param f path of the configuration file
	 * @param r filter that receives stop words / stop natures; not used when
	 *          {@code t} is {@code CM_TDIC}
	 * @param t one of {@code CM_TWDS}, {@code CM_TNTS}, {@code CM_TDIC};
	 *          any other value reads the file without registering anything
	 */
	private void CmAnsjFilter(String f, FilterRecognition r, int t) {
		if (null == f) {
			System.err.println("CmAnsj: configuration file path is null, nothing loaded");
			return;
		}
		
		try {
			BufferedReader br = new BufferedReader(new FileReader(f));
			
			// try/finally guarantees the reader is closed even when reading or
			// inserting fails; the file uses no Java 7+ syntax elsewhere, so
			// try-with-resources is deliberately not introduced here.
			try {
				String buf;
				
				while (null != (buf = br.readLine())) {
					switch (t) {
					case CM_TWDS:
						r.insertStopWord(buf);
						break;
					case CM_TNTS:
						r.insertStopNatures(buf);
						break;
					case CM_TDIC:
						// Static call: presumably extends a dictionary shared
						// by all instances - confirm against the ANSJ version in use.
						UserDefineLibrary.insertWord(buf);
						break;
					default:
						break;
					}
				}
			} finally {
				br.close();
			}
		} catch(Exception e) {
			System.err.println("CmAnsj: failed to load " + f);
			e.printStackTrace();
		}
	}
	
	/**
	 * Registers every line of every regular file directly inside the given
	 * directory as a user-dictionary word. Sub-directories are skipped, and a
	 * file that cannot be read is reported without preventing the remaining
	 * files from being loaded.
	 * @param f path of the keyword directory
	 */
	private void CmAnsjKeysDict(String f) {
		if (null == f) {
			System.err.println("CmAnsj: keyword directory path is null, nothing loaded");
			return;
		}
		
		try {
			// listFiles() returns null when the path is not a directory or
			// cannot be listed.
			File[] all = new File(f).listFiles();
			
			if (null == all) {
				System.err.println("CmAnsj: cannot list keyword directory " + f);
				return;
			}
			
			for (File v : all) {
				if (!v.isFile()) {
					continue;
				}
				
				// Same per-line handling as a user dictionary; CmAnsjFilter
				// closes the file and reports its own errors.
				CmAnsjFilter(v.getPath(), null, CM_TDIC);
			}
		} catch(Exception e) {
			System.err.println("CmAnsj: failed to scan keyword directory " + f);
			e.printStackTrace();
		}
	}
}